DSAN 5100 Hypothesis Testing

Author

Emily Mitchum

Imports

library(stats)
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Read in Clean Data

earnings_df <-read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_quarterly_earnings_data.csv')
head(earnings_df)
        date education_level median_weekly_earnings
1 2019-01-01         ba_only                   1213
2 2019-04-01         ba_only                   1236
3 2019-07-01         ba_only                   1281
4 2019-10-01         ba_only                   1259
5 2020-01-01         ba_only                   1263
6 2020-04-01         ba_only                   1303
tuition_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_tuition_data.csv')
head(tuition_df)
  Academic.Year average_tuition     degree_granting year_parsed
1       2019-20           15747 Non-Degree Granting  2019-01-01
2       2019-20           25447     Degree Granting  2019-01-01
3       2020-21           26084     Degree Granting  2020-01-01
4       2020-21           15755 Non-Degree Granting  2020-01-01
5       2021-22           26572     Degree Granting  2021-01-01
6       2021-22           15978 Non-Degree Granting  2021-01-01
  public_or_private
1            public
2            public
3            public
4            public
5            public
6            public
emp_ratio_df <- read.csv('/Users/Emi/Library/CloudStorage/GoogleDrive-emilymitchum265@gmail.com/Shared drives/DSAN/DSAN 5100/Final Project/vocational_school_employment_outcomes/Cleaned data/Cleaned_monthly_emp_pop_ratio.csv')
head(emp_ratio_df)
        date education_level employment_population_ratio
1 2019-01-01         ba_plus                        72.1
2 2019-02-01         ba_plus                        72.1
3 2019-03-01         ba_plus                        72.2
4 2019-04-01         ba_plus                        72.3
5 2019-05-01         ba_plus                        72.2
6 2019-06-01         ba_plus                        72.3

ANOVA on Quarterly Median Weekly Earnings Data

# Correct data types before ANOVA
earnings_df <- earnings_df %>%
  mutate(
    year = factor(format(as.Date(date), "%Y")),
    education_level = factor(education_level)
  )

tuition_df <- tuition_df |> mutate(year = factor(format(as.Date(year_parsed), "%Y")))

Visual Analysis of Earnings by Education Level

library(ggplot2)

ggplot(earnings_df, aes(
  x = education_level,
  y = median_weekly_earnings,
  fill = education_level
)) +
  geom_boxplot(alpha = 0.85, outlier.color = "gray40") +
  scale_fill_brewer(palette = "Set2") +   # cleaner, professional palette
  labs(
    title = "Median Weekly Earnings by Education Level",
    x = "Education Level",
    y = "Median Weekly Earnings"
  ) +
  theme_minimal(base_size = 16) +         # larger base text for slides
  theme(
    text = element_text(family = "Helvetica", face = "bold"),
    plot.title = element_text(size = 22, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 18, face = "bold"),
    axis.title.y = element_text(size = 18, face = "bold"),
    axis.text.x = element_text(size = 14, face = "bold", angle = 20, hjust = 1),
    axis.text.y = element_text(size = 14, face = "bold"),
    legend.position = "none",             # removes redundant legend
    panel.grid.major.x = element_blank()  # cleaner slide aesthetic
  )

One-Way ANOVA (Ignoring Time)

Check for equal variances

library(car)
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
leveneTest(median_weekly_earnings ~ education_level, data = earnings_df)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value   Pr(>F)   
group  2  6.9734 0.001669 **
      75                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(median_weekly_earnings ~ education_level * year, data = earnings_df)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group 20  1.2631 0.2415
      57               
oneway <- aov(median_weekly_earnings ~ education_level, data = earnings_df)
summary(oneway)
                Df  Sum Sq Mean Sq F value Pr(>F)    
education_level  2 4505007 2252503   264.3 <2e-16 ***
Residuals       75  639267    8524                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The one-way ANOVA test revealed a highly significant effect of education level on median weekly earnings. This indicates that mean earnings differ substantially across the three education groups.

# Welch's ANOVA
oneway.test(median_weekly_earnings ~ education_level, data = earnings_df)

    One-way analysis of means (not assuming equal variances)

data:  median_weekly_earnings and education_level
F = 207.99, num df = 2.000, denom df = 48.517, p-value < 2.2e-16

Two-Way ANOVA (Analyzing Effect of Education Level & Time)

model_two_way <- aov(median_weekly_earnings ~ education_level * year, data = earnings_df)

summary(model_two_way)
                     Df  Sum Sq Mean Sq  F value   Pr(>F)    
education_level       2 4505007 2252503 4339.900  < 2e-16 ***
year                  6  578857   96476  185.881  < 2e-16 ***
education_level:year 12   30826    2569    4.949 1.54e-05 ***
Residuals            57   29584     519                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Bachelor’s degree holders consistently earn much more than those with some college/associate degrees, who in turn earn more than those with only a high school diploma.

#Games-Howell Test
library(rstatix)

Attaching package: 'rstatix'
The following object is masked from 'package:stats':

    filter
earnings_df |> games_howell_test(median_weekly_earnings ~ education_level)
# A tibble: 3 × 8
  .y.            group1 group2 estimate conf.low conf.high    p.adj p.adj.signif
* <chr>          <chr>  <chr>     <dbl>    <dbl>     <dbl>    <dbl> <chr>       
1 median_weekly… ba_on… high_…    -559.   -625.      -492. 1.03e-12 ****        
2 median_weekly… ba_on… some_…    -440.   -507.      -372. 1.04e-12 ****        
3 median_weekly… high_… some_…     119.     68.1      170. 2.37e- 6 ****        

ROI Analysis

limited_earnings_df <- earnings_df[earnings_df$education_level != "high_school",] 

limited_earnings_df <- limited_earnings_df |> 
  mutate(degree_category = case_when(
    education_level == "ba_only" ~ "Bachelor's",education_level == "some_college_assoc" ~ "Vocational",
    TRUE ~ "NA"
  ))

tuition_df <- tuition_df |> 
  mutate(degree_category = case_when(
    degree_granting == "Degree Granting" ~ "Bachelor's",
    degree_granting == "Non-Degree Granting" ~ "Vocational",
    TRUE ~ "NA"
  ))

Join Dataframes together to assess ROI

temp_earnings_df <- limited_earnings_df |> select(c("median_weekly_earnings","year","degree_category"))
temp_tuition_df <- tuition_df |> select(c("average_tuition","year","degree_category"))

merged_df <- merge(temp_earnings_df, temp_tuition_df, by = c("degree_category","year"))

roi_df <- merged_df |> mutate( ROI = median_weekly_earnings / average_tuition )

roi_yearly <- roi_df %>%
  group_by(degree_category, year) %>%
  summarize(ROI = mean(ROI), .groups = "drop") %>%
  arrange(degree_category, year)

roi_clean <- merged_df |>
  dplyr::group_by(degree_category, year) |>
  dplyr::summarise(
    annual_earnings = mean(median_weekly_earnings) * 52,
    tuition  = mean(average_tuition),
    ROI      = annual_earnings / tuition,  # annualized ROI
    .groups = "drop"
  )
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
plot <- plot_ly(roi_clean, x = ~year) %>%
  add_trace(
    y = ~ROI,
    color = ~degree_category,
    colors = c("Bachelor's" = "#E64B35", "Vocational" = "#4DBBD5"),
    type = "scatter",
    mode = "lines+markers",
    name = ~paste(degree_category, "ROI"),
    yaxis = "y"
  ) %>%

  add_trace(
    y = ~annual_earnings,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Earnings"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%

  add_trace(
    y = ~tuition,
    x = ~year,
    split = ~degree_category,
    type = "bar",
    name = ~paste(degree_category, "Tuition"),
    opacity = 0.4,
    yaxis = "y2",
    showlegend = TRUE
  ) %>%
  layout(
    title = "ROI with Earnings and Tuition Over Time",
    barmode = "group",
    xaxis = list(title = "Year"),
    yaxis = list(
      title = "ROI (Earnings / Tuition)",
      rangemode = "tozero"
    ),
    yaxis2 = list(
      title = "Dollars (Earnings & Tuition)",
      overlaying = "y",
      side = "right",
      rangemode = "tozero"
    ),
    legend = list(title = list(text = "<b>Series</b>"))
  )

# htmlwidgets::saveWidget(plot, "roi_plot.html")

plot

Time Series Analysis Using Employment Population Ratio

emp_ratio_df$date <- as.Date(emp_ratio_df$date, "%Y-%m-%d")
emp_ratio_df$employment_population_ratio <- as.numeric(emp_ratio_df$employment_population_ratio)
max <- max(emp_ratio_df$employment_population_ratio)

fig <- plot_ly(
  emp_ratio_df,
  x = ~date,
  y = ~employment_population_ratio,
  color = ~education_level,        
  colors = c(
    "ba_plus" = "#E64B35",
    "some_college_assoc" = "#4DBBD5",
    "high_school" = "#00A087"
  ),
  type = 'scatter',
  mode = 'lines'
)

fig <- fig |>
  layout(
    title = "Employment Population Ratio (2019–2025)",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Employment-Population Ratio"),
    legend = list(title = list(text = "<b>Education Level</b>")),
    shapes = list(
      list(type = 'rect',
           x0 = "2019-12-01", x1 = "2020-12-01",
           y0 = 40, y1 = max,
           line = list(color = 'rgba(0, 0, 255, 0.5)', width = 2),
           fillcolor = 'rgba(0, 0, 255, 0.2)')
    ),
    annotations = list(
      list(
        x = "2020-06-01",
        y = max + 2,
        text = "COVID-19",
        showarrow = FALSE,
        font = list(color = 'black', size = 12)
      )
  )
)


fig